library(readxl)
library(tidyverse)        # loads ggplot2, dplyr, stringr, etc.
library(plotly)
library(ggthemes)
library(gridExtra)
library(manipulateWidget)
library(knitr)
library(DT)
library(maps)
library(leaflet)          # used for the interactive map below
library(RColorBrewer)
library(plyr)             # loaded after dplyr on purpose: count() below uses plyr's interface
library(fossil)
library(geosphere)
library(tm)
library(quanteda)
library(tidytext)
library(stringi)
library(SnowballC)
library(wordcloud)
library(plotrix)
opts_chunk$set(fig.path = "images/",
               cache.path = "cache/",
               cache = FALSE,
               echo = FALSE,
               message = FALSE,
               warning = FALSE)
Kickstarter is an American public-benefit corporation based in Brooklyn, New York, that maintains a global crowdfunding platform focused on creativity. The company’s stated mission is to “help bring creative projects to life”.
Kickstarter has reportedly received more than $1.9 billion in pledges from 9.4 million backers to fund 257,000 creative projects, such as films, music, stage shows, comics, journalism, video games, technology, and food-related projects.
For this assignment, I am asking you to analyze the descriptions of Kickstarter projects to identify commonalities of successful (and unsuccessful) projects, using the text mining techniques we covered in the past two lectures.
The dataset for this assignment is taken from webroboto.io's repository. They developed a scraper robot that has crawled all Kickstarter projects monthly since 2009. We will just take data from the most recent crawl, on 2018-02-15.
To simplify your task, I have downloaded the files and partially cleaned the scraped data. In particular, I converted several JSON columns, corrected some obvious data issues, and removed some variables that are not of interest (or are missing frequently). I have also subsetted the data to contain only projects originating in the United States (so that all projects are English-language and USD-denominated).
The data is contained in the file kickstarter_projects.csv and includes about 150,000 projects and about 20 variables.
## Tasks for the Assignment
# use the achievement ratio (pledged relative to goal) to identify successful projects
kick_starter <- read_csv("./kickstarter_projects_1.csv")
# drop rows with missing values; the result must be assigned, since na.omit()
# alone does not modify kick_starter in place
kick_starter <- na.omit(kick_starter)
kick_starter
## # A tibble: 134,705 x 23
## backers_count blurb converted_pledg~ country created_at currency
## <int> <chr> <int> <chr> <chr> <chr>
## 1 80 I will be a~ 3596 USA 07/01/12 USD
## 2 80 Surrealisti~ 3125 USA 24/03/12 USD
## 3 82 1000 Artist~ 4586 USA 05/03/12 USD
## 4 31 P.M.A.F.T.W~ 1036 USA 23/03/12 USD
## 5 21 "The Sequel~ 5217 USA 21/03/12 USD
## 6 153 We need to ~ 15445 USA 06/02/12 USD
## 7 18 Source Mate~ 2190 USA 08/03/12 USD
## 8 148 A series of~ 16115 USA 25/01/12 USD
## 9 164 Become a pa~ 12643 USA 25/03/12 USD
## 10 479 Portrait ar~ 62736 USA 09/03/12 USD
## # ... with 134,695 more rows, and 17 more variables: deadline <chr>,
## # goal <dbl>, id <int>, is_starrable <lgl>, launched_at <chr>,
## # name <chr>, pledged <dbl>, slug <chr>, source_url <chr>,
## # spotlight <lgl>, staff_pick <lgl>, state <chr>,
## # state_changed_at <chr>, location_town <chr>, location_state <chr>,
## # top_category <chr>, sub_category <chr>
# remove duplicate projects (deduplicate on project id)
kick_starter1 <- kick_starter[!duplicated(kick_starter$id), ]
# achievement ratio: amount pledged as a percentage of the funding goal (assumes goal > 0)
AR <- (kick_starter1$pledged / kick_starter1$goal) * 100
kick_starter1$AR <- AR
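Before aggregating by category, it is worth a quick look at the distribution of the ratio, since extreme outliers (tiny goals with large pledges) can dominate category means. A one-line sanity check, not part of the original pipeline:
summary(kick_starter1$AR)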
AR_plot_table <- aggregate(AR ~ top_category, kick_starter1, mean)
plot_category <- ggplot(data = AR_plot_table, aes(x = top_category, y = AR)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme_economist() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8), hjust = 1)) +
  ggtitle("Achievement Ratio vs Categories") +
  ylab("Achievement Ratio") +
  xlab("Categories")
#interactive
ggplotly(plot_category)
us_states <- read_csv("./us.csv")
# build a "town state" key so coordinates can be joined onto the projects
kick_starter1$location <- paste(kick_starter1$location_town, kick_starter1$location_state)
merge_df <- merge(kick_starter1, us_states, by = "location")
merge_df <- arrange(merge_df, desc(AR))
# count successful projects per US state (plyr::count returns a freq column)
successful_df <- filter(merge_df, state == "successful")
state_count <- count(successful_df, "state_name")
state_count <- arrange(state_count, desc(freq))
plot_s_states <- ggplot(data = state_count, aes(x = state_name, y = freq)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme_economist() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8), hjust = 1)) +
  ggtitle("Successful Projects by State") +
  ylab("Frequency") +
  xlab("State")
ggplotly(plot_s_states)
# top 50 cities by mean achievement ratio
town_plot_table <- aggregate(AR ~ location_town.x, merge_df, mean)
town_plot_table <- arrange(town_plot_table, desc(AR))
top_50_cities <- head(town_plot_table, 50)
plot_50_cities <- ggplot(data = top_50_cities, aes(x = location_town.x, y = AR)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme_economist() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8), hjust = 1)) +
  ggtitle("Top 50 Cities") +
  ylab("Achievement Ratio") +
  xlab("City Name")
ggplotly(plot_50_cities)
# map: successful-project counts by state (blue) and top-50 cities by AR (orange)
state_map <- merge(state_count, merge_df, by = "state_name")
state_map <- state_map[!duplicated(state_map$state_name), ]
city_map <- merge(top_50_cities, merge_df, by = "location_town.x")
city_map <- city_map[!duplicated(city_map$location_town.x), ]
pop1 <- paste("State Name:", state_map$state_name, "<br/>")
pop2 <- paste("City Name:", city_map$location_town.x, "<br/>")
map_innovative <- leaflet() %>%
  setView(lng = -95.7129, lat = 37.0902, zoom = 3) %>%
  addProviderTiles(providers$Stamen.Toner) %>%
  addCircleMarkers(data = state_map, lng = ~lng, lat = ~lat,
                   color = "blue", popup = pop1) %>%
  addCircleMarkers(data = city_map, lng = ~lng, lat = ~lat,
                   radius = 5, stroke = TRUE, weight = 2, opacity = 0.7,
                   popup = pop2, color = "orange")
map_innovative
# rank projects by achievement ratio; keep the 1,000 most successful and the 1,000 worst failures
top_df <- kick_starter1[order(-kick_starter1$AR), ]
top_df <- filter(top_df, state == "successful")
top_1000 <- head(top_df, 1000)
last_df <- kick_starter1[order(kick_starter1$AR), ]
last_df <- filter(last_df, state == "failed")
last_1000 <- head(last_df, 1000)
# build tm corpora from the blurbs of the two groups
top1000_corpus_blurb <- Corpus(VectorSource(top_1000$blurb))
last1000_corpus_blurb <- Corpus(VectorSource(last_1000$blurb))
clean_corpus <- function(corpus){
  # strip HTML line breaks and newlines left over from scraping
  corpus <- tm_map(corpus, content_transformer(function(x) gsub("<br>", "", x, fixed = TRUE)))
  corpus <- tm_map(corpus, content_transformer(function(x) gsub("\n", "", x, fixed = TRUE)))
  corpus <- tm_map(corpus, content_transformer(function(x) gsub("/><br", "", x, fixed = TRUE)))
  # normalize case, then drop numbers, punctuation, and English stopwords
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("en"))
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
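Applied to a single made-up blurb (a toy illustration, not part of the analysis), the cleaner lower-cases the text and strips numbers, punctuation, and stopwords:
# expected result: "fund nd album" (removeNumbers drops only the digit in "2nd")
content(clean_corpus(VCorpus(VectorSource("Fund our 2nd Album!")))[[1]])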
top1000_corpus_blurb_clean <- clean_corpus(top1000_corpus_blurb)
last1000_corpus_blurb_clean <- clean_corpus(last1000_corpus_blurb)
top1000_corpus_blurb_stemmed <- tm_map(top1000_corpus_blurb_clean, stemDocument)
last1000_corpus_blurb_stemmed <- tm_map(last1000_corpus_blurb_clean, stemDocument)
stemCompletion2 <- function(x, dictionary) {
  # split the stemmed document into tokens and drop empty strings
  x <- unlist(strsplit(as.character(x), " "))
  x <- x[x != ""]
  # complete each stem against the unstemmed dictionary corpus
  x <- stemCompletion(x, dictionary = dictionary)
  x <- paste(x, sep = "", collapse = " ")
  PlainTextDocument(stripWhitespace(x))
}
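As a quick illustration of stemCompletion2 on toy text (hypothetical example, not part of the analysis), stems such as "creativ" are completed back to the most prevalent match in the dictionary corpus:
toy_dict <- VCorpus(VectorSource(c("creative projects", "creating music")))
toy_stems <- tm_map(toy_dict, stemDocument)
# expected result: "creative projects"
content(stemCompletion2(toy_stems[[1]], dictionary = toy_dict))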
top_1000_comp <- lapply(top1000_corpus_blurb_stemmed, stemCompletion2,
                        dictionary = top1000_corpus_blurb_clean)
last_1000_comp <- lapply(last1000_corpus_blurb_stemmed, stemCompletion2,
                         dictionary = last1000_corpus_blurb_clean)
top_1000_comp_all <- as.VCorpus(top_1000_comp)
last_1000_comp_all <- as.VCorpus(last_1000_comp)
# tf-idf-weighted document-term matrices, tidied to one row per (document, term)
top_1000_comp_all_tfidf <- tidy(DocumentTermMatrix(top_1000_comp_all,
                                                   control = list(weighting = weightTfIdf)))
last_1000_comp_all_tfidf <- tidy(DocumentTermMatrix(last_1000_comp_all,
                                                    control = list(weighting = weightTfIdf)))
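To eyeball the heaviest-weighted terms before building the word clouds, a quick sketch (not in the original pipeline; after tidy(), the tf-idf weight is stored in the count column):
head(arrange(top_1000_comp_all_tfidf, desc(count)), 10)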
kick_starter_dtm1 <- DocumentTermMatrix(top_1000_comp_all)
kick_starter_m1.1 <- as.matrix(kick_starter_dtm1)
dim(kick_starter_m1.1)
## [1] 1000 3426
kick_starter_dtm2 <- DocumentTermMatrix(last_1000_comp_all)
kick_starter_m1.2 <- as.matrix(kick_starter_dtm2)
dim(kick_starter_m1.2)
## [1] 1000 2894
kick_starter_td1 <- tidy(kick_starter_dtm1)
kick_starter_td2 <- tidy(kick_starter_dtm2)
# attach tf-idf weights to the tidied counts (the raw count column is kept alongside tf_idf)
kick_starter_tf_idf1 <- kick_starter_td1 %>% bind_tf_idf(term, document, count) %>% arrange(desc(tf_idf))
kick_starter_tf_idf2 <- kick_starter_td2 %>% bind_tf_idf(term, document, count) %>% arrange(desc(tf_idf))
# word clouds: compare frequent terms in successful vs. unsuccessful blurbs
term_frequency_DT1 <- kick_starter_tf_idf1
term_frequency_DT2 <- kick_starter_tf_idf2
term_frequency_DT1$state <- "successful"
term_frequency_DT2$state <- "unsuccessful"
# ten most frequent terms in each group, stacked for later comparison
top10_term_frequency_DT1 <- head(arrange(term_frequency_DT1, desc(count)), 10)
top10_term_frequency_DT2 <- head(arrange(term_frequency_DT2, desc(count)), 10)
final_tf_idf <- rbind(top10_term_frequency_DT1, top10_term_frequency_DT2)
tf_idf_merge_2000 <- rbind(term_frequency_DT1, term_frequency_DT2)
# drop the two lightest colours from the PuOr palette so words stay readable
purple_orange <- brewer.pal(10, "PuOr")
purple_orange <- purple_orange[-(1:2)]
par(mar = c(1, 1, 1, 1))
Word cloud of the top 1,000 successful blurbs:
set.seed(11)
wordcloud(top_1000_comp_all_tfidf$term, top_1000_comp_all_tfidf$count,
          scale = c(4, .5), min.freq = 1, max.words = 500, colors = purple_orange)
Word cloud of the bottom 1,000 failed blurbs:
set.seed(112)
wordcloud(last_1000_comp_all_tfidf$term, last_1000_comp_all_tfidf$count,
          scale = c(4, .5), min.freq = 1, max.words = 500, colors = purple_orange)
#### Pyramid plot of common words (adapted from https://stats.stackexchange.com/questions/2455/how-to-make-age-pyramid-like-plot-in-r)
# terms appearing in both groups, ranked by their frequency among successful blurbs
common <- merge(term_frequency_DT1, term_frequency_DT2, by = "term")
common_final <- common[!duplicated(common$term), ]
common_final <- arrange(common_final, desc(count.x))
common_final <- head(common_final, 20)
mcol <- color.gradient(c(0, 0, 0.5, 1), c(0, 0, 0.5, 1), c(1, 1, 0.5, 1), 18)
fcol <- color.gradient(c(1, 1, 0.5, 1), c(0.5, 0.5, 0.5, 1), c(0.5, 0.5, 0.5, 1), 18)
p <- pyramid.plot(common_final$count.x, common_final$count.y,
                  labels = common_final$term,
                  top.labels = c("successful", " ", "unsuccessful"),
                  main = "Words in Common", lxcol = mcol, rxcol = fcol,
                  gap = 1, show.values = TRUE)
p
## [1] 5.1 4.1 4.1 2.1
# readability of successful blurbs: Flesch-Kincaid grade level vs. achievement ratio
kickstarter_success_Corpus <- corpus(top1000_corpus_blurb)
FRE_success <- textstat_readability(kickstarter_success_Corpus, measure = c('Flesch.Kincaid'))
FRE_success$AR <- top_1000$AR
plot_FRE_AR <- ggplot(data = FRE_success, aes(x = AR, y = Flesch.Kincaid)) +
  geom_point(alpha = 0.5, size = 3) +
  theme_economist() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8), hjust = 1)) +
  ggtitle("Achievement Ratio vs Flesch-Kincaid Grade") +
  ylab("Flesch-Kincaid grade") +
  xlab("Achievement ratio")
ggplotly(plot_FRE_AR)
# Hu & Liu opinion lexicons; relative paths assume the dictionaries/ folder sits next to this file
pos <- read.table("dictionaries/positive-words.txt", as.is = TRUE)
neg <- read.table("dictionaries/negative-words.txt", as.is = TRUE)
# simple tone score in [-1, 1]: (positive - negative) / (positive + negative) lexicon hits,
# with add-one smoothing so blurbs with no hits do not divide by zero
sentiment <- function(words = c("really great good stuff bad")){
  tok <- quanteda::tokens(words)
  pos.count <- sum(tok[[1]] %in% pos[, 1]) + 1
  neg.count <- sum(tok[[1]] %in% neg[, 1]) + 1
  out <- (pos.count - neg.count) / (pos.count + neg.count)
  return(out)
}
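A quick sanity check on a toy string (hypothetical example; it assumes "great" and "good" appear in the positive lexicon and "bad" in the negative one, as in the Hu & Liu lists):
# 2 positive hits + 1 = 3, 1 negative hit + 1 = 2, so the score is (3 - 2) / (3 + 2) = 0.2
sentiment("really great good stuff bad")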
# score the tone of each successful blurb
top_1000$Tone <- sapply(top_1000$blurb, sentiment, USE.NAMES = FALSE)
plot_sentiment <- ggplot(data = top_1000, aes(x = Tone, y = AR)) +
  geom_point(alpha = 0.5, size = 3) +
  theme_economist() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8), hjust = 1)) +
  ggtitle("Tone of the Blurb vs AR") +
  ylab("AR") +
  xlab("Tone of the blurb")
ggplotly(plot_sentiment)
Comparison cloud
# score the failed blurbs too, then pool both groups and classify each blurb by tone
last_1000$Tone <- sapply(last_1000$blurb, sentiment, USE.NAMES = FALSE)
top_last_2000 <- rbind(top_1000, last_1000)
top_last_2000$ToneType <- ifelse(top_last_2000$Tone < 0, "Negative", "Positive")
tryTolower <- function(x) {
  # return NA instead of erroring if tolower() fails on x
  y <- NA
  try_error <- tryCatch(tolower(x), error = function(e) e)
  if (!inherits(try_error, "error"))
    y <- tolower(x)
  return(y)
}
replacePunctuation <- function(x) {
  gsub("[[:punct:]]+", " ", x)
}
create_clean_corpus <- function(text_vector){
  # build a corpus from a character vector and clean it for the comparison cloud
  text_corpus <- VCorpus(VectorSource(text_vector))
  text_corpus_clean <- sapply(text_corpus, function(x) tryTolower(x))
  text_corpus_clean <- VCorpus(VectorSource(text_corpus_clean))
  text_corpus_clean <- tm_map(text_corpus_clean, content_transformer(tolower))
  text_corpus_clean <- tm_map(text_corpus_clean, removeNumbers)
  text_corpus_clean <- tm_map(text_corpus_clean, removeWords, c(stopwords()))
  text_corpus_clean <- tm_map(text_corpus_clean, content_transformer(replacePunctuation))
  text_corpus_clean <- tm_map(text_corpus_clean, stemDocument, language = "english")
  text_corpus_clean <- tm_map(text_corpus_clean, stripWhitespace)
  return(text_corpus_clean)
}
# concatenate all positive-tone and all negative-tone blurbs into two "documents"
top_last_2000_positive <- paste(top_last_2000$blurb[top_last_2000$ToneType == "Positive"], collapse = " ")
top_last_2000_negative <- paste(top_last_2000$blurb[top_last_2000$ToneType == "Negative"], collapse = " ")
corp_pos_neg <- c(top_last_2000_positive, top_last_2000_negative)
all_blurbs <- create_clean_corpus(corp_pos_neg)
# tf-idf-weighted term-document matrix with one column per tone
all_blurbs_pos_neg <- TermDocumentMatrix(all_blurbs, control = list(weighting = weightTfIdf))
colnames(all_blurbs_pos_neg) <- c("Positive", "Negative")
all_blurbs_pos_neg <- as.matrix(all_blurbs_pos_neg)
comparison.cloud(all_blurbs_pos_neg, colors = c("red", "blue"), max.words = 100)
# NRC emotion lexicon, split into one data frame per emotion
nrc_dictionary <- get_sentiments("nrc")
joy11 <- subset(nrc_dictionary, sentiment == "joy")
fear11 <- subset(nrc_dictionary, sentiment == "fear")
anger11 <- subset(nrc_dictionary, sentiment == "anger")
anticipation11 <- subset(nrc_dictionary, sentiment == "anticipation")
sadness11 <- subset(nrc_dictionary, sentiment == "sadness")
surprise11 <- subset(nrc_dictionary, sentiment == "surprise")
# count lexicon hits for each of six NRC emotions and return the largest count
mood_sentiment <- function(words = c("really great good stuff bad")){
  tok_mood <- quanteda::tokens(words)
  # use the word column explicitly: subsetting a tibble with [, 1] keeps it a tibble,
  # which %in% cannot match against
  joy.count <- sum(tok_mood[[1]] %in% joy11$word)
  sadness.count <- sum(tok_mood[[1]] %in% sadness11$word)
  fear.count <- sum(tok_mood[[1]] %in% fear11$word)
  surprise.count <- sum(tok_mood[[1]] %in% surprise11$word)
  anger.count <- sum(tok_mood[[1]] %in% anger11$word)
  anticipation.count <- sum(tok_mood[[1]] %in% anticipation11$word)
  # note: this returns the size of the dominant emotion, not its name
  out <- max(joy.count, sadness.count, fear.count, surprise.count, anger.count, anticipation.count)
  return(out)
}
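A minimal check on a made-up phrase (hypothetical example; it assumes "celebrate" carries the NRC joy tag, which it does in the published lexicon):
# at least one joy hit and no emotion with more hits, so this returns 1
mood_sentiment("we celebrate the launch")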
# dominant-emotion count for each successful blurb
top_1000$Mood_Sentiment <- sapply(top_1000$blurb, mood_sentiment, USE.NAMES = FALSE)